#Load packages
library(data.table)
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(lmtest)
library(maps)
library(mapdata)
library(readxl)
#Add data GPS
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
head(country_data)
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, isocode, risktaking, gender, age)
# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))
# Display the cleaned data
cleaned_data
# Display the number of records removed per variable
records_removed_per_variable
country isocode ison region language date
0 0 0 0 0 0
id_gallup wgt patience risktaking posrecip negrecip
0 0 190 634 32 247
altruism trust subj_math_skills gender age
74 163 132 0 276
# List all variables
variable_list <- names(cleaned_data)
# Display the list of variables
print(variable_list)
[1] "country" "isocode" "ison" "region" "language"
[6] "date" "id_gallup" "wgt" "patience" "risktaking"
[11] "posrecip" "negrecip" "altruism" "trust" "subj_math_skills"
[16] "gender" "age"
#select only the variables of interest
cleaned_data <- cleaned_data %>%
select(country, isocode, ison, risktaking, gender, age)
cleaned_data
Calculate Z-Score for Age per Country
cleaned_data <- cleaned_data %>%
group_by(country) %>%
mutate(z_score_age = scale(age))
# Display the new column with Z-Scores per Country
head(cleaned_data)
cleaned_data$agecat <- cut(
cleaned_data$age,
breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
right = FALSE # Left end (inclusive), right end (exclusive)
)
# Display the new column
head(cleaned_data)
# Calculate mean values for risktaking, gender, and age per country
country_means <- cleaned_data %>%
group_by(country) %>%
summarize(
mean_risktaking = mean(risktaking, na.rm = TRUE),
mean_gender = mean(gender, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
mean_z_score_age = mean(z_score_age, na.rm = TRUE),
)
# Merge mean values back to the original dataset
cleaned_data <- merge(cleaned_data, country_means, by = "country", all.x = TRUE)
# Display the updated dataset
head(cleaned_data)
NA
#Hardship-list
excel_path <- "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/Hardship/Hardship.xlsx"
hardship_values <- read_excel(excel_path) # Read data from the Excel file
New names:
print(colnames(cleaned_data)) # Display column names in cleaned_data
[1] "country" "isocode" "ison" "risktaking" "gender"
[6] "age" "z_score_age" "agecat" "mean_risktaking" "mean_gender"
[11] "mean_age" "mean_z_score_age"
print(colnames(hardship_values)) # Display column names in hardship_data
[1] "country" "mean_homicide" "gdp" "gini" "Infant_mortality"
[6] "life_expect" "hardship" "...8"
cleaned_data <- left_join(cleaned_data, hardship_values, by = "country") # Perform the left_join operation
head(cleaned_data) # Display the updated dataset
#select only the variables of interest
gps_country <- cleaned_data %>%
select(country, isocode, ison, mean_risktaking, mean_gender, mean_age, mean_z_score_age, hardship)
gps_country
#select only the variables of interest
gps_country <- cleaned_data %>%
group_by(country) %>%
summarize(
isocode = first(isocode),
ison = first(ison),
mean_risktaking = mean(risktaking, na.rm = TRUE),
mean_gender = mean(gender, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
hardship = first(hardship)
) %>%
ungroup()
gps_country
# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))
# Display the number of different countries
number_of_countries
[1] 76
#CODE FOR THE PRESENTATION ########### ########## ##########
world_map <- map_data("world") # Create a world map with country borders
recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data
world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not
# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
geom_polygon(color = "white") +
scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "none") + # Set guide to "none" to remove the legend
theme_void() +
labs(title = "GPS", fill = "Status") +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) # Remove legend and center the title
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)
# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)
# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)
# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
Age Range: 15 to 99
cat("Average Age: ", average_age, "\n")
Average Age: 41.73605
cat("Median Age: ", median_age, "\n")
Median Age: 40
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)
# Display the number of participants in the age categories
print(agecat_counts)
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
6888 16872 15905 13583 11374 8570 4688 1559
ggplot(cleaned_data, aes(x = age)) +
geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
theme_minimal()
ggplot(cleaned_data, aes(x = country, y = age)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Distribution of Age per Country", x = "Country", y = "Risktaking") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)
# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females: 36024
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males: 43415
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)
# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)
# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)
# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
country = rownames(gender_by_country),
total_participants = total_participants,
female = gender_by_country[, "Female"],
male = gender_by_country[, "Male"]
)
# Display the result table
result_table
# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)
# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
theme_minimal()
# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
geom_boxplot() +
labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
theme_minimal() +
guides(fill = FALSE)
# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
geom_point(size = 0.2) +
geom_smooth(method = "lm")
# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
geom_boxplot() +
labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
theme_minimal() +
guides(fill = FALSE)
# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
FOR PRESENTATION #############################
# Skalierung des Z-Scores für das Alter anpassen
cleaned_data$z_score_age_scaled <- 15 * cleaned_data$z_score_age + 42
# Risk vs age with color-coded gender per Country
ggplot(cleaned_data, aes(z_score_age_scaled, risktaking, color = factor(gender))) +
geom_point(size = 0.1) +
geom_smooth(method = "lm") +
geom_vline(xintercept = 42, linetype = "dashed", color = "black", size = 1) + # Vertikale Linie für den Mittelwert
scale_color_manual(values = c("blue", "red"), labels = c("Male", "Female")) +
labs(color = "Gender") +
ggtitle("Risk vs Age GPS") +
xlab("Age (Mean = 42, SD = 15)") +
scale_x_continuous(breaks = seq(0, 100, by = 15), limits = c(0, 100)) + # Anpassung der Intervalle auf der X-Achse
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_risktaking)) +
geom_point(aes(color = isocode), size = 3) +
geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
labs(title = "Risktaking vs. Hardship across Countries GPS",
x = "Hardship", y = "Risktaking") +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_z_score_age)) +
geom_point(aes(color = isocode), size = 3) +
geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
labs(title = "Risktaking vs. Age across Countries GPS",
x = "Hardship", y = "Age") +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
model_age <- lm(hardship ~ mean_z_score_age, data = cleaned_data)
model_gender <- lm(hardship ~ gender, data = cleaned_data)
# Coefficients for age-effect
age_coefficients <- coef(model_age)
# Coefficients for gender-effect
gender_coefficients <- coef(model_gender)
# Skalierung des Z-Scores für das Alter anpassen
cleaned_data$z_score_age_scaled <- 15 * cleaned_data$mean_z_score_age + 42
# Scatterplot for age-effect
ggplot(cleaned_data, aes(x = mean_z_score_age, y = hardship)) +
geom_point(size = 2) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = "Scatterplot of Hardship vs Age",
x = "Age", y = "Hardship") +
theme_minimal()
Laura has to rewrite the code –> Table already included with the lm
# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age + gender, data = cleaned_data)
# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]
summary(model)
Call:
lm(formula = risktaking ~ age * gender, data = cleaned_data)
Residuals:
Min 1Q Median 3Q Max
-2.3288 -0.6770 -0.0426 0.6446 3.0662
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.6413392 0.0131258 48.861 < 2e-16 ***
age -0.0124875 0.0002925 -42.692 < 2e-16 ***
gender -0.1285097 0.0178148 -7.214 5.50e-13 ***
age:gender -0.0021551 0.0003944 -5.464 4.66e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9644 on 79435 degrees of freedom
Multiple R-squared: 0.07001, Adjusted R-squared: 0.06998
F-statistic: 1993 on 3 and 79435 DF, p-value: < 2.2e-16
Laura has to rewrite the code –> Table already included with the lm
Group by country mutate intercept run and save intercept model
# Group the data by country
table_data <- gps_data %>%
group_by(country, isocode) %>%
summarize(
n = n(),
female_percentage = mean(gender == 1) * 100,
mean_age = mean(age, na.rm = TRUE),
age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
mean_risktaking = mean(risktaking, na.rm = TRUE)
)
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Intercept and slope for age
intercept_age <- 0.5759701
slope_age <- -0.0137902
# Intercept and slope for gender
intercept_gender <- 0.124666
slope_gender <- -0.227338
# Add the intercept and slope information to the table
table_data <- table_data %>%
mutate(
intercept_age = intercept_age,
slope_age = slope_age,
intercept_gender = intercept_gender,
slope_gender = slope_gender,
)
# Display the updated table
table_data
#Preparing for linear regression
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)
# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
Missing values in 'Country': FALSE
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")
Missing values in 'Risktaking': FALSE
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, risktaking, age)
# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
group_by(country) %>%
do(model = lm(risktaking ~ age, data = .)) %>%
summarize(
country = first(country),
intercept = summary(model)$coefficients[1],
slope = summary(model)$coefficients[2],
r_squared = summary(model)$r.squared
)
# Display regression results for each country
print(regression_results)
ggplot(data = regression_results, aes(x = country, y = intercept)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
high_intercept_countries <- subset(regression_results, intercept > 0.75)
# View the countries with intercept values over 0.75
print(high_intercept_countries)
# Create scatterplots with regression lines for countries with intercept > 0.75
plots <- lapply(high_intercept_countries$country, function(country) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking, color = factor(gender))) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
color = "Gender") +
scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
print(p)
})